Loading packages:
library(ggplot2)
library(RColorBrewer)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(reshape2)
library(kableExtra)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks plotly::filter(), stats::filter()
## ✖ dplyr::group_rows() masks kableExtra::group_rows()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(stringr)
library(data.table)
##
## Attaching package: 'data.table'
##
## The following objects are masked from 'package:lubridate':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
##
## The following objects are masked from 'package:dplyr':
##
## between, first, last
##
## The following object is masked from 'package:purrr':
##
## transpose
##
## The following objects are masked from 'package:reshape2':
##
## dcast, melt
x_1 <- runif(1000, min = 0, max = 1)
x_1_plot <- as.data.frame(x_1) %>%
ggplot(aes(x = x_1)) +
geom_histogram(binwidth=0.01, fill = '#34495E') +
theme(panel.background =
element_rect(fill ="white"),
axis.title.y = element_text(color = '#17202A'),
axis.title.x = element_text(color = '#17202A')) +
xlab("Distribution of x_1") +
ylab("Count")
ggplotly(x_1_plot)
### Why do I have to use plotly here? Without it, R generates the plot but does
### not display it?
u_plus_1 <- rchisq(1000, df =1)
u_plus_1_plot <- as.data.frame(u_plus_1) %>%
ggplot(aes( x = u_plus_1)) +
geom_histogram(binwidth = 0.1, fill = '#34495E') +
theme(panel.background =
element_rect(fill = "white"),
axis.title.y = element_text(color = '#17202A'),
axis.title.x = element_text(color = '#17202A')) +
xlab("Distribution of u+1") +
ylab("Count")
ggplotly(u_plus_1_plot)
###Question 2
# a)
f2_a <- function(N = 5)
{
x <- rchisq(df = 1, N)
return(c(mean(x), sd(x))) # returns a vector containing the mean and the standard deviation of the generated values.
}
# b)
f2_b <- function( n=10000,N = 5 ){
MEAN <- c() # To create a MEAN vector
SD <- c() # To create a SD vector
for (i in 1:n)
{
df <- f2_a(N)
MEAN[i] <- df[1] # It attributes the mean to the vector 'MEAN'
SD[i] <- df[2] # It attributes the standard deviation to the vector 'SD'
}
return (data.frame(MEAN,SD))
# We get the desired dataset in a dataframe format.
}
simulated_chi <- f2_b()
# c)
simulation_means_plot <- as.data.frame(simulated_chi[, 1]) %>%
ggplot(aes(x = simulated_chi[, 1])) +
geom_histogram(binwidth=0.3, fill = '#34495E') +
theme(panel.background =
element_rect(fill ="white"),
axis.title.y = element_text(color = '#17202A'),
axis.title.x = element_text(color = '#17202A')) +
xlab("Distribution of Means") +
ylab("Count")
ggplotly(simulation_means_plot)
# N = 10 :
q3a <- f2_b(N=10)
summary(q3a[,1])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.07237 0.66471 0.93040 0.99057 1.24218 4.27836
# N = 100 :
q3b <- f2_b(N=100)
summary(q3b[,1])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.5789 0.9000 0.9887 0.9976 1.0871 1.6027
# N = 1000 :
q3c <- f2_b(N=1000)
summary(q3c[,1])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.8243 0.9699 0.9998 1.0005 1.0307 1.1773
# We observe that all summary statistics tend to 1 the greater the N.
###Question 4
sd_n_10 <- sd(simulated_chi[, 1])
print(sd_n_10)
## [1] 0.6325471
multi_4a <- sd_n_10*(sqrt(10)/sqrt(100))
print(multi_4a)
## [1] 0.2000289
multi_4b <- sd_n_10*(sqrt(10)/sqrt(1000))
print(multi_4b)
## [1] 0.06325471
multi_4c <- sd_n_10*(sqrt(10)/sqrt(3500000))
print(multi_4c)
## [1] 0.0010692
The second multiplication divides the standard deviation by 10. Through guessing I obtain a sample size of N approx. = 3 500 000.
# N = 10 :
q4a <- f2_b(N=10)
hist(q3a[,1])
# N = 100 :
q4b <- f2_b(N=100)
hist(q3b[,1])
# N = 1000 :
q4c <- f2_b(N=1000)
hist(q3c[,1])
# We observe the mean becomes approximately more and more
# normally distributed.
# Playing around with ggplot2 for my own amusement:
#a)
q4a_plot <- as.data.frame(q4a) %>%
ggplot(aes(x = MEAN)) +
geom_histogram(binwidth=0.05, fill = '#34495E') +
theme(panel.background =
element_rect(fill ="white"),
axis.title.y = element_text(color = '#17202A'),
axis.title.x = element_text(color = '#17202A')) +
xlab("Distribution of Means") +
ylab("Count")
ggplotly(q4a_plot)
#b)
q4b_plot <- as.data.frame(q4b) %>%
ggplot(aes(x = MEAN)) +
geom_histogram(binwidth=0.05, fill = '#34495E') +
theme(panel.background =
element_rect(fill ="white"),
axis.title.y = element_text(color = '#17202A'),
axis.title.x = element_text(color = '#17202A')) +
xlab("Distribution of Means") +
ylab("Count")
ggplotly(q4b_plot)
#c)
q4c_plot <- as.data.frame(q4c) %>%
ggplot(aes(x = MEAN)) +
geom_histogram(binwidth=0.01, fill = '#34495E') +
theme(panel.background =
element_rect(fill ="white"),
axis.title.y = element_text(color = '#17202A'),
axis.title.x = element_text(color = '#17202A')) +
xlab("Distribution of Means") +
ylab("Count")
ggplotly(q4c_plot)
How could I create an alternative chart that overlapps the three distributions to show that they become more and more Gaussian? For example, how could I obtain a graph with three bell-shaped curves for each of the distributions?
# a)
rm(list = ls())
N = 10
# b)
x1 <- runif(N, min = 0, max = 1)
x2 <- rbinom(N, 1, 0.3)
u <- rchisq(N, df = 1)
y <- 1 + 2 * x1 + 10 * x2 + u
# c)
regression_q6 <- lm(y ~ x1 + x2)
# d)
regression_q6$coefficients
## (Intercept) x1 x2
## 2.4733578 0.2912243 12.0992154
sd <- c(sd(x1),sd(x2),sd(u))
# Creating a function to generate data:
function1_q6 <- function(N = 10){
x1 <- runif(N, min = 0, max = 1)
x2 <- rbinom(N, 1, 0.3)
u <- rchisq(N, df = 1)
y <- 1 + 2*x1 + 10*x2 + u
regression_q6 <- lm(y ~ x1 + x2)
return(c(regression_q6$coefficients, "sd(x1)"=sd(x1), "sd(x2)"=sd(x2), "sd(u)"=sd(u)))
}
# Second, a function to simulate this several times and collect the data.
function2_q6 <- function(n=10000, N=10){
df <- data.frame(matrix(0, ncol = 6, nrow = 0))
names(df) <- c("Intercept",
"First coef",
"Second coef",
"Sd(x1)",
"Sd(x2)",
"Sd(u)")
for (i in 1:n){
d <- function1_q6(N)
for (j in 1:6){
df[i,j] <- d[j]
}
}
return(df)
}
df <- function2_q6()
#a)
# With N=10:
df_1 <- function2_q6(N=10)
summary(df_1[, 1]) # To summarize the intercept
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -6.342 1.323 1.827 1.999 2.518 14.046
summary(df_1[,2]) # To summarize the first coefficient
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -13.009 1.042 2.018 2.017 2.988 18.703
summary(df_1[,3]) # To summarize the second coefficient
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 2.913 9.334 9.891 9.997 10.558 23.032 278
hist(df_1[, 1])
hist(df_1[,2])
hist(df_1[,3])
#b)
# With N=1000:
df_2 <- function2_q6(N=1000)
summary(df_2[, 1]) # To summarize the intercept
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.600 1.937 1.999 2.001 2.063 2.332
summary(df_2[,2]) # To summarize the first coefficient
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.384 1.898 1.998 2.000 2.102 2.575
summary(df_2[,3]) # To summarize the second coefficient
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 9.643 9.931 9.997 9.998 10.063 10.390
hist(df_2[, 1])
hist(df_2[,2])
hist(df_2[,3])
# With N=1000:
df_3 <- function2_q6(N=10000)
summary(df_3[, 1]) # To summarize the intercept
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.886 1.979 2.000 2.000 2.020 2.117
summary(df_3[,2]) # To summarize the first coefficient
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.818 1.967 2.000 2.000 2.034 2.193
summary(df_3[,3]) # To summarize the second coefficient
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 9.886 9.979 10.000 10.000 10.020 10.121
hist(df_3[, 1])
hist(df_3[,2])
hist(df_3[,3])
We can see that as we increase sample size, the estimates converge around the values we predetermined 2 and 10.
Can I add an option to the histogram to modify the label?
# a)
rm(list = ls())
N = 10
# b)
x1 <- runif(N, min = 0, max = 1)
x2 <- rbinom(N, 1, 0.3)
u <- rchisq(N, df = 1)
y <- 1 + 2 * x1 + 10 * x2 + u
# c)
regression_q8 <- lm(y ~ x1 + x2)
# d)
function1_q8 <- function(N = 10){
x1 <- runif(N, min = 0, max = 1)
x2 <- rbinom(N, 1, 0.3)
u <- rchisq(N, df = 1)
y <- 1 + 2*x1 + 10*x2 + u
regression_q8 <- lm(y ~ x1 + x2)
summary_regq8 <- summary(regression_q8)
return(c(regression_q8$coefficients, "sd(x1)"=sd(x1), "sd(x2)"=sd(x2), "sd(u)"=sd(u), summary_regq8$fstatistic))
}
# Second, a function to simulate this several times and collect the data.
function2_q8 <- function(n=10000, N=10){
df_q8 <- data.frame(matrix(0, ncol = 7, nrow = 0))
names(df_q8) <- c("Intercept",
"First coef",
"Second coef",
"Sd(x1)",
"Sd(x2)",
"Sd(u)",
"F-stat")
for (i in 1:n){
d <- function1_q8(N)
for (j in 1:7){
df_q8[i,j] <- d[j]
}
}
return(df_q8)
}
df_q8 <- function2_q8()
summary(df_q8)
## Intercept First coef Second coef Sd(x1)
## Min. :-5.038 Min. :-13.616 Min. : 1.686 Min. :0.07118
## 1st Qu.: 1.331 1st Qu.: 1.030 1st Qu.: 9.342 1st Qu.:0.25441
## Median : 1.826 Median : 2.014 Median : 9.887 Median :0.28672
## Mean : 2.003 Mean : 1.995 Mean : 9.995 Mean :0.28484
## 3rd Qu.: 2.503 3rd Qu.: 2.954 3rd Qu.:10.553 3rd Qu.:0.31745
## Max. :11.929 Max. : 14.807 Max. :18.825 Max. :0.42010
## NA's :290
## Sd(x2) Sd(u) F-stat
## Min. :0.0000 Min. :0.09707 Min. : 0.001
## 1st Qu.:0.4216 1st Qu.:0.77403 1st Qu.: 34.624
## Median :0.4830 Median :1.11730 Median : 76.927
## Mean :0.4474 Mean :1.24991 Mean : 176.528
## 3rd Qu.:0.5164 3rd Qu.:1.58767 3rd Qu.: 174.690
## Max. :0.5270 Max. :6.16029 Max. :17223.124
##
#8.1)
# With N=10:
df_q8_1 <- function2_q8(N=10)
summary(df_q8[, 1]) # To summarize the intercept
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -5.038 1.331 1.826 2.003 2.503 11.929
summary(df_q8[,2]) # To summarize the first coefficient
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -13.616 1.030 2.014 1.995 2.954 14.807
summary(df_q8[,3]) # To summarize the second coefficient
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.686 9.342 9.887 9.995 10.553 18.825 290
summary(df_q8[, 7])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.001 34.624 76.927 176.528 174.690 17223.124
hist(df_q8[, 1])
hist(df_q8[,2])
hist(df_q8[,3])
hist(df_q8[, 7])
#8.2)
# With N=1000:
df_q8_2 <- function2_q8(N=1000)
summary(df_q8[, 1]) # To summarize the intercept
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -5.038 1.331 1.826 2.003 2.503 11.929
summary(df_q8[,2]) # To summarize the first coefficient
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -13.616 1.030 2.014 1.995 2.954 14.807
summary(df_q8[,3]) # To summarize the second coefficient
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.686 9.342 9.887 9.995 10.553 18.825 290
summary(df_q8[, 7])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.001 34.624 76.927 176.528 174.690 17223.124
hist(df_q8[, 1])
hist(df_q8[,2])
hist(df_q8[,3])
hist(df_q8[, 7])